In [136]:
%matplotlib inline
#%pprint ON
import sys
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit

# Load the pickled project dataset and reshape it into one row per dictionary
# key (the person names seen in the outputs below) and one column per feature.
#
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
# NOTE(review): text mode "r" presumably relies on an ASCII (protocol 0) pickle
# read under Python 2; Python 3 requires "rb" -- confirm before porting.
with open("final_project_dataset.pkl", "r") as dataset_file:
    # The context manager guarantees the file handle is closed after loading.
    data_dict = pickle.load(dataset_file)

df = pd.DataFrame.from_dict(data_dict)
df = df.transpose()

# Missing values are stored as the string 'NaN'; turn them into real NaN.
df = df.replace('NaN', np.nan)

# Keep only rows that have at least two non-missing values.
df = df.dropna(thresh=2, axis=0)

# Remove the email_address column and two rows that are excluded from the
# analysis (presumably an aggregate row and a non-person entry -- verify).
# The axis is passed by keyword so its meaning is explicit.
df.drop('email_address', axis=1, inplace=True)
df.drop('TOTAL', axis=0, inplace=True)
df.drop('THE TRAVEL AGENCY IN THE PARK', axis=0, inplace=True)

# Coerce the object-typed columns to numeric dtypes.
# NOTE(review): convert_objects is deprecated in newer pandas releases in
# favour of pd.to_numeric; kept here to match the pandas version this
# notebook was written against.
df = df.convert_objects(convert_numeric=True)

#df = df.apply(lambda x: x.fillna(x.mean()),axis=0)
In [137]:
# Plot every feature against row position so extreme values stand out, and
# print the three largest values of each feature above its plot. This view was
# used to review and eliminate outliers.

# Columns that are never plotted:
#   - 'email_address' and 'color' hold strings ('color' only exists once the
#     scatter-plot cell has run), so plotting them would fail;
#   - 'poi' is the label rather than a feature;
#   - 'director_fees' was excluded by the original filter.
skipped_columns = ('email_address', 'poi', 'director_fees', 'color')

for column in list(df):
    if column in skipped_columns:
        continue

    # Show the top three rows for this feature. The call form of print with a
    # single argument produces the same output on Python 2.
    # NOTE(review): DataFrame.sort was replaced by sort_values in newer pandas;
    # kept to match the pandas version this notebook was written against.
    print(df[[column]].sort([column], ascending=[0]).head(3))

    plt.figure(figsize=(26, 6))
    df[column].plot(style='.')

    # Put one tick per row and label it with the row's index value, rotated so
    # the long labels do not overlap.
    tick_positions = range(len(df[column]))
    plt.xticks(tick_positions, df.index)
    locs, labels = plt.xticks()
    plt.setp(labels, rotation=90)

    plt.title(column)
    plt.show()
                      bonus
LAVORATO JOHN J     8000000
LAY KENNETH L       7000000
SKILLING JEFFREY K  5600000

                  deferral_payments
FREVERT MARK A              6426990
HORTON STANLEY C            3131860
HUMPHREY GENE E             2964506

                    deferred_income
BOWEN JR RAYMOND M             -833
GAHN ROBERT S                 -1042
SHELBY REX                    -4167

                exercised_stock_options
LAY KENNETH L                  34348384
HIRKO JOSEPH                   30766064
RICE KENNETH D                 19794175

                    expenses
MCCLELLAN GEORGE      228763
URQUHART JOHN A       228656
SHANKMAN JEFFREY A    178979

                     from_messages
KAMINSKI WINCENTY J          14368
KEAN STEVEN J                 6759
BECK SALLY W                  4343

                  from_poi_to_this_person
LAVORATO JOHN J                       528
DIETRICH JANET R                      305
KITCHEN LOUISE                        251

                  from_this_person_to_poi
DELAINEY DAVID W                      609
LAVORATO JOHN J                       411
KEAN STEVEN J                         387

                  loan_advances
LAY KENNETH L          81525000
FREVERT MARK A          2000000
PICKERING MARK R         400000

                 long_term_incentive
MARTIN AMANDA K              5145434
LAY KENNETH L                3600000
ECHOLS JOHN B                2234774

                    other
LAY KENNETH L    10359729
FREVERT MARK A    7427621
MARTIN AMANDA K   2818454

                   restricted_stock
LAY KENNETH L              14761694
WHITE JR THOMAS E          13847074
PAI LOU L                   8453763

                  restricted_stock_deferred
BHATNAGAR SANJAY                   15456290
BELFER ROBERT                         44093
CHAN RONNIE                          -32460

                     salary
SKILLING JEFFREY K  1111258
LAY KENNETH L       1072321
FREVERT MARK A      1060932

                   shared_receipt_with_poi
BELDEN TIMOTHY N                      5521
SHAPIRO RICHARD S                     4527
LAVORATO JOHN J                       3962

                   to_messages
SHAPIRO RICHARD S        15149
KEAN STEVEN J            12754
KITCHEN LOUISE            8305

                  total_payments
LAY KENNETH L          103559793
FREVERT MARK A          17252530
BHATNAGAR SANJAY        15456290

                    total_stock_value
LAY KENNETH L                49110078
HIRKO JOSEPH                 30766064
SKILLING JEFFREY K           26093672

In [138]:
# Feature names with the 'poi' label first, followed by every other column.
all_features = list(df.columns)
all_features.remove('poi')
features_list = ['poi'] + all_features

#my_dataset = df.T.to_dict('dict')

### Extract features and labels from dataset for local testing
#data = featureFormat(my_dataset, features_list, sort_keys = True)

# Tag each row with a marker colour: red where poi is True, blue where False.
for poi_flag, marker_color in ((True, 'r'), (False, 'b')):
    df.loc[df['poi'] == poi_flag, 'color'] = marker_color

# 'color' is a helper column, not a feature; it is only present in
# all_features when this cell is re-run after the column already exists.
plottable = [name for name in all_features if name != 'color']

# One scatter plot for every ordered pair of features, coloured by poi flag.
for x_feature in plottable:
    for y_feature in plottable:
        plt.scatter(x=df[x_feature], y=df[y_feature], c=df['color'])
        plt.title(' vs '.join((x_feature, y_feature)))
        plt.xlabel(x_feature)
        plt.ylabel(y_feature)
        plt.show()
In [138]: